# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Generate Online Appendix Tables A1–A4.

library(tidyverse)
library(here)

weighted_se <- function(x, w, na.rm = TRUE) {
  if (na.rm) {
    complete_cases <- !is.na(x) & !is.na(w)
    x <- x[complete_cases]
    w <- w[complete_cases]
  }

  n <- length(x)
  if (n <= 1) {
    return(NA)
  }

  weighted_mean <- sum(w * x) / sum(w)
  weighted_var <- sum(w * (x - weighted_mean) ^ 2) / sum(w)
  effective_n <- (sum(w)) ^ 2 / sum(w ^ 2)
  sqrt(weighted_var / effective_n)
}

if (!dir.exists(here("out", "final"))) {
  dir.create(here("out", "final"), recursive = TRUE)
}

if (!dir.exists(here("figures"))) {
  dir.create(here("figures"), recursive = TRUE)
}

marriages_raw <- read_csv(here("data", "clean", "sample.csv")) %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(
    Urban = Urbanization >= 0.5,
    Intergroup = Ethnicity_Head != Ethnicity_Spouse,
    Interblock = EthnicBlock_Head != EthnicBlock_Spouse
  )

n_gc_ethnicity <- read_csv(here("out", "inter", "n_gc_ethnicity.csv"))
n_gc_ethnic_block <- read_csv(here("out", "inter", "n_gc_ethnic_block.csv"))

compute_group_share_market <- function(marriages, group_head, intermarry, n_gc, n_gc_key, group_type_label) {
  marriages_counts <- marriages %>%
    filter(
      !is.na(.data[[group_head]]),
      !is.na(Const),
      !is.na(District),
      !is.na(DecadeMarried)
    ) %>%
    group_by(Const, District, DecadeMarried) %>%
    summarise(total_marriages = n(), .groups = "drop")

  r_gc <- marriages %>%
    filter(
      !is.na(.data[[group_head]]),
      !is.na(Const),
      !is.na(District),
      !is.na(DecadeMarried)
    ) %>%
    group_by(.data[[group_head]], Const, District, DecadeMarried) %>%
    summarise(r_gc = mean(.data[[intermarry]], na.rm = TRUE), .groups = "drop")

  n_c <- n_gc %>%
    group_by(Const, District, Decade) %>%
    summarise(n_c = sum(n_gc, na.rm = TRUE), .groups = "drop")

  p_gc <- n_gc %>%
    inner_join(n_c, by = c("Const", "District", "Decade")) %>%
    mutate(p_gc = n_gc / n_c)

  combined <- r_gc %>%
    inner_join(
      p_gc,
      by = c(
        setNames(n_gc_key, group_head),
        "Const",
        "District",
        DecadeMarried = "Decade"
      )
    )

  i_market <- combined %>%
    filter(p_gc < 1) %>%
    mutate(
      component = (r_gc / (1 - p_gc)) * p_gc,
      expectation = 1 - p_gc
    ) %>%
    group_by(Const, District, DecadeMarried) %>%
    summarise(
      I_group_share = sum(component, na.rm = TRUE),
      avg_observed_rate = weighted.mean(r_gc, w = n_gc, na.rm = TRUE),
      avg_observed_rate_se = weighted_se(r_gc, w = n_gc, na.rm = TRUE),
      avg_expectation = weighted.mean(expectation, w = n_gc, na.rm = TRUE),
      avg_expectation_se = weighted_se(expectation, w = n_gc, na.rm = TRUE),
      total_n_census = sum(n_gc, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    left_join(marriages_counts, by = c("Const", "District", "DecadeMarried")) %>%
    mutate(Group_Type = group_type_label)

  i_market
}

aggregate_rates <- function(i_market, by_cols = character()) {
  i_market %>%
    group_by(across(all_of(by_cols))) %>%
    summarise(
      Observed = weighted.mean(avg_observed_rate, w = total_n_census, na.rm = TRUE),
      Observed_SE = weighted_se(avg_observed_rate, w = total_n_census, na.rm = TRUE),
      Adjusted = weighted.mean(I_group_share, w = total_n_census, na.rm = TRUE),
      Adjusted_SE = weighted_se(I_group_share, w = total_n_census, na.rm = TRUE),
      .groups = "drop"
    )
}

build_appendix_table <- function(marriages) {
  i_eth <- compute_group_share_market(
    marriages = marriages,
    group_head = "Ethnicity_Head",
    intermarry = "Intergroup",
    n_gc = n_gc_ethnicity,
    n_gc_key = "Ethnicity",
    group_type_label = "Ethnic Census Category"
  )

  i_block <- compute_group_share_market(
    marriages = marriages,
    group_head = "EthnicBlock_Head",
    intermarry = "Interblock",
    n_gc = n_gc_ethnic_block,
    n_gc_key = "EthnicBlock",
    group_type_label = "Ethnic Block"
  )

  urban_rural_lookup <- marriages %>%
    select(Const, District, DecadeMarried, Urban) %>%
    distinct() %>%
    mutate(UrbanArea = if_else(Urban == TRUE, "Urban", "Rural"))

  eth_with_urban <- i_eth %>%
    left_join(urban_rural_lookup, by = c("Const", "District", "DecadeMarried"))

  block_with_urban <- i_block %>%
    left_join(urban_rural_lookup, by = c("Const", "District", "DecadeMarried"))

  valid_keys_for_n <- n_gc_ethnicity %>%
    distinct(Ethnicity, Const, District, Decade)

  marriages_for_n <- marriages %>%
    filter(
      !is.na(Ethnicity_Head),
      !is.na(Const),
      !is.na(District),
      !is.na(DecadeMarried)
    ) %>%
    inner_join(
      valid_keys_for_n,
      by = c(
        Ethnicity_Head = "Ethnicity",
        "Const",
        "District",
        DecadeMarried = "Decade"
      )
    )

  counts_countrywide <- tibble(Row = "Countrywide", N_couples = nrow(marriages_for_n))

  counts_decade <- marriages_for_n %>%
    count(DecadeMarried, name = "N_couples") %>%
    transmute(Row = DecadeMarried, N_couples)

  counts_urban <- marriages_for_n %>%
    mutate(UrbanArea = if_else(Urban == TRUE, "Urban", "Rural")) %>%
    count(UrbanArea, name = "N_couples") %>%
    transmute(Row = paste0(UrbanArea, " areas"), N_couples)

  counts_all <- bind_rows(counts_countrywide, counts_decade, counts_urban)

  eth_rates <- bind_rows(
    aggregate_rates(eth_with_urban) %>% mutate(Row = "Countrywide"),
    aggregate_rates(eth_with_urban, by_cols = c("DecadeMarried")) %>% mutate(Row = DecadeMarried),
    aggregate_rates(eth_with_urban, by_cols = c("UrbanArea")) %>% mutate(Row = paste0(UrbanArea, " areas"))
  ) %>%
    select(Row, Observed, Adjusted) %>%
    mutate(
      Observed = round(Observed, 3),
      Adjusted = round(Adjusted, 3)
    ) %>%
    rename(
      Observed_Ethnic_Census_Category = Observed,
      Adjusted_Ethnic_Census_Category = Adjusted
    )

  block_rates <- bind_rows(
    aggregate_rates(block_with_urban) %>% mutate(Row = "Countrywide"),
    aggregate_rates(block_with_urban, by_cols = c("DecadeMarried")) %>% mutate(Row = DecadeMarried),
    aggregate_rates(block_with_urban, by_cols = c("UrbanArea")) %>% mutate(Row = paste0(UrbanArea, " areas"))
  ) %>%
    select(Row, Observed, Adjusted) %>%
    mutate(
      Observed = round(Observed, 3),
      Adjusted = round(Adjusted, 3)
    ) %>%
    rename(
      Observed_Ethnic_Block = Observed,
      Adjusted_Ethnic_Block = Adjusted
    )

  counts_all %>%
    left_join(eth_rates, by = "Row") %>%
    left_join(block_rates, by = "Row") %>%
    arrange(match(Row, c(
      "Countrywide",
      "1951-1960",
      "1961-1970",
      "1971-1980",
      "1981-1990",
      "1991-2000",
      "2001-2010",
      "Urban areas",
      "Rural areas"
    )))
}

marriages_a1 <- marriages_raw %>%
  filter(FullReside == T)

table_a1 <- build_appendix_table(marriages_a1)

marriages_a2 <- marriages_raw %>%
  filter(FullReside == 1) %>%
  filter(Polygamous_Head == "No more than one wife linked via SPLOC") %>%
  filter(!is.na(AgeMarried_Spouse), AgeMarried_Spouse <= 29)

table_a2 <- build_appendix_table(marriages_a2)

marriages_a3 <- marriages_raw %>%
  filter(Polygamous_Head == "No more than one wife linked via SPLOC")

table_a3 <- build_appendix_table(marriages_a3)

write_csv(
  table_a1,
  here("out", "final", "Table_A1_Intermarriage_Rates_in_Zambia_Including_Polygamous_Marriages.csv")
)
write_csv(
  table_a1,
  here("figures", "Table_A1_Intermarriage_Rates_in_Zambia_Including_Polygamous_Marriages.csv")
)

write_csv(
  table_a2,
  here("out", "final", "Table_A2_Intermarriage_Rates_in_Zambia_First_Marriages_Only.csv")
)
write_csv(
  table_a2,
  here("figures", "Table_A2_Intermarriage_Rates_in_Zambia_First_Marriages_Only.csv")
)

write_csv(
  table_a3,
  here("out", "final", "Table_A3_Intermarriage_Rates_in_Zambia_Including_Couples_Married_Before_Moving.csv")
)
write_csv(
  table_a3,
  here("figures", "Table_A3_Intermarriage_Rates_in_Zambia_Including_Couples_Married_Before_Moving.csv")
)

blocks <- readxl::read_excel(here("data", "original", "ethnic group blocks.xlsx")) %>%
  select(EthnicGroup = "Census category", EthnicBlock = "Ethnic block") %>%
  na.omit()

ipums_ddi <- ipumsr::read_ipums_ddi(here("data", "ipums", "ipumsi_00003.xml"))
ipums_raw <- ipumsr::read_ipums_micro(ipums_ddi, vars = c("YEAR", "ETHNICZM", "PERWT"))

ipums_census <- tibble(
  CensusYear = ipums_raw$YEAR,
  Ethnicity = as.character(haven::as_factor(ipums_raw$ETHNICZM)),
  Weight = as.numeric(ipums_raw$PERWT) / 100
) %>%
  filter(CensusYear %in% c(2000, 2010))

ipums_census <- ipums_census %>%
  left_join(blocks, by = c("Ethnicity" = "EthnicGroup")) %>%
  filter(!is.na(EthnicBlock))

ethnicity_shares <- ipums_census %>%
  group_by(CensusYear, Ethnicity) %>%
  summarise(w = sum(Weight, na.rm = TRUE), .groups = "drop") %>%
  group_by(CensusYear) %>%
  mutate(p = w / sum(w, na.rm = TRUE)) %>%
  ungroup() %>%
  select(CensusYear, Ethnicity, p)

ethnicity_to_block <- ipums_census %>%
  group_by(Ethnicity, EthnicBlock) %>%
  summarise(w = sum(Weight, na.rm = TRUE), .groups = "drop") %>%
  group_by(Ethnicity) %>%
  slice_max(order_by = w, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(Ethnicity, EthnicBlock) %>%
  mutate(EthnicBlock = if_else(EthnicBlock == "Drop", "Dropped", EthnicBlock))

table_a4 <- ethnicity_to_block %>%
  left_join(ethnicity_shares, by = "Ethnicity") %>%
  pivot_wider(
    names_from = CensusYear,
    values_from = p,
    names_prefix = "Percent_"
  ) %>%
  mutate(
    Percent_2000 = format(round(Percent_2000, 3), nsmall = 3),
    Percent_2010 = format(round(Percent_2010, 3), nsmall = 3)
  ) %>%
  arrange(match(EthnicBlock, c(sort(unique(EthnicBlock[EthnicBlock != "Dropped"])), "Dropped")), desc(as.numeric(Percent_2010)), desc(as.numeric(Percent_2000)), Ethnicity) %>%
  rename(
    Ethnic_Census_Category = Ethnicity,
    Ethnic_Block = EthnicBlock
  )

write_csv(
  table_a4,
  here("out", "final", "Table_A4_Ethnic_Census_Categories_and_Ethnic_Blocks.csv")
)
write_csv(
  table_a4,
  here("figures", "Table_A4_Ethnic_Census_Categories_and_Ethnic_Blocks.csv")
)

cat(
  "\nAppendix tables saved to out/final and figures/\n",
  "- Table_A1_Intermarriage_Rates_in_Zambia_Including_Polygamous_Marriages.csv\n",
  "- Table_A2_Intermarriage_Rates_in_Zambia_First_Marriages_Only.csv\n",
  "- Table_A3_Intermarriage_Rates_in_Zambia_Including_Couples_Married_Before_Moving.csv\n",
  "- Table_A4_Ethnic_Census_Categories_and_Ethnic_Blocks.csv\n"
)
